#!/bin/bash
#
# Picks one raw dataset path out of scripts/dclm.txt, runs scripts/search.sh
# on it against the piqa+hellaswag target file, then hands the resampled
# directory to scripts/merged_jsonl.sh.
#
# Usage: <this script> INDEX
#   INDEX  zero-based index into dclm.txt (INDEX 0 selects the first line).
#
# Exits non-zero, without printing "FINISHED", if INDEX is invalid, the
# selected line is empty, or any step fails.
set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# NOTE(review): num_nodes is not referenced anywhere in this script; kept in
# case external tooling reads it -- confirm and remove.
# shellcheck disable=SC2034
num_nodes=30

readonly SCRIPT_PATH=/share/projset/dsir7
readonly INDEX_DIR="${SCRIPT_PATH}/indexes/qwen_tokenizer_3_300000"
readonly DATASET_LIST="${SCRIPT_PATH}/scripts/dclm.txt"
readonly CONFIG_FILE="${INDEX_DIR}/feat_config.json"
readonly TARGET_DATASETS="${SCRIPT_PATH}/scripts/piqa+hellaswag_natural_text.jsonl"
readonly NUM_TO_SAMPLE=1000000
readonly MIN_EXAMPLE_LENGTH=0

# Require a plain non-negative integer; anything else would otherwise be
# evaluated as an arbitrary arithmetic expression below.
index=${1-}
[[ "$index" =~ ^[0-9]+$ ]] \
  || die "usage: ${0##*/} INDEX (zero-based line index into ${DATASET_LIST})"

# 10# forces base 10 so an index such as "08" is not parsed as invalid octal.
line_number=$(( 10#$index + 1 ))

[[ -r "$DATASET_LIST" ]] || die "cannot read dataset list: ${DATASET_LIST}"

# Print only the requested line and stop reading the list right after it.
result=$(sed -n "${line_number}{p;q;}" "$DATASET_LIST") \
  || die "failed to read line ${line_number} of ${DATASET_LIST}"

# Strip leading/trailing whitespace, which the previously unquoted expansion
# of this value discarded implicitly.
read -r result <<<"$result" || true
[[ -n "$result" ]] || die "line ${line_number} of ${DATASET_LIST} is missing or empty"

RAW_DATASETS=$result
name=$(basename -- "$result" .jsonl)

# Paths derived from the dataset name. MERGED_DIR is presumably produced by
# search.sh -- TODO confirm against that script.
MERGED_DIR="${INDEX_DIR}/${name}_piqahellaswag-merge_0.0_0/resampled"
OUTFILE="${INDEX_DIR}/${name}-piqahellaswag.jsonl"
# echo $MERGED_DIR
# echo $OUTFILE

# The helper scripts are invoked via paths relative to SCRIPT_PATH.
cd "$SCRIPT_PATH" || die "cannot cd to ${SCRIPT_PATH}"

# Both steps run in the foreground, so no `wait` is needed between them;
# set -e stops the script before the merge step if the search step fails.
bash scripts/search.sh "$CONFIG_FILE" "$RAW_DATASETS" "$TARGET_DATASETS" \
  "$NUM_TO_SAMPLE" "$MIN_EXAMPLE_LENGTH"
bash scripts/merged_jsonl.sh "$MERGED_DIR" "$OUTFILE"
echo "FINISHED"